Similarty matrix



In [20]:

    
import numpy as np
import scipy as sc
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer



In [6]:

    
np.random.rand(3,2)









    Out[6]:





array([[ 0.20099636,  0.05064982],
       [ 0.99286728,  0.39290391],
       [ 0.1397375 ,  0.0186377 ]])



In [5]:

    
df = pd.read_csv('dengue_neo.csv')



In [6]:

    
df.head()









    Out[6]:






  
    
      
      Unnamed: 0
      Authors
      Title
      Year
      Source.title
      Volume
      Issue
      Art..No.
      Page.start
      Page.end
      ...
      Conference.code
      ISSN
      ISBN
      CODEN
      PubMed.ID
      Language.of.Original.Document
      Abbreviated.Source.Title
      Document.Type
      Source
      EID
    
  
  
    
      0
      1
      Gouvêa M.M.; Jr.
      Time-spatial model on the dynamics of the prol...
      2017.0
      Communications in Nonlinear Science and Numeri...
      44
      NaN
      NaN
      130
      143
      ...
      NaN
      10075704
      NaN
      NaN
      NaN
      English
      Comm. Nonlinear Sci. Numer. Simul.
      Article
      Scopus
      2-s2.0-84981507175
    
    
      1
      3
      Wang X.; Tang S.; Cheke R.A.
      A stage structured mosquito model incorporatin...
      2016.0
      Journal of Theoretical Biology
      411
      NaN
      NaN
      27
      36
      ...
      NaN
      225193
      NaN
      JTBIA
      NaN
      English
      J. Theor. Biol.
      Article
      Scopus
      2-s2.0-84991688005
    
    
      2
      4
      Tang B.; Xiao Y.; Tang S.; Wu J.
      Modelling weekly vector control against Dengue...
      2016.0
      Journal of Theoretical Biology
      410
      NaN
      NaN
      65
      76
      ...
      NaN
      225193
      NaN
      JTBIA
      NaN
      English
      J. Theor. Biol.
      Article
      Scopus
      2-s2.0-84988473946
    
    
      3
      5
      Delmelle E.; Hagenlocher M.; Kienberger S.; Ca...
      A spatial model of socioeconomic and environme...
      2016.0
      Acta Tropica
      164
      NaN
      NaN
      169
      176
      ...
      NaN
      0001706X
      NaN
      ACTRA
      NaN
      English
      Acta Trop.
      Article
      Scopus
      2-s2.0-84988037437
    
    
      4
      6
      Rodrigues H.S.; Monteiro M.T.T.; Torres D.F.M.
      Seasonality effects on dengue: basic reproduct...
      2016.0
      Mathematical Methods in the Applied Sciences
      39
      16
      NaN
      4671
      4679
      ...
      NaN
      1704214
      NaN
      MMSCD
      NaN
      English
      Math Methods Appl Sci
      Conference Paper
      Scopus
      2-s2.0-84920399968
    
  

5 rows × 42 columns



In [7]:

    
print(df.columns.values)









    



['Unnamed: 0' 'Authors' 'Title' 'Year' 'Source.title' 'Volume' 'Issue'
 'Art..No.' 'Page.start' 'Page.end' 'Page.count' 'Cited.by' 'DOI' 'Link'
 'Affiliations' 'Authors.with.affiliations' 'Abstract' 'Author.Keywords'
 'Index.Keywords' 'Molecular.Sequence.Numbers' 'Chemicals.CAS' 'Tradenames'
 'Manufacturers' 'Funding.Details' 'References' 'Correspondence.Address'
 'Editors' 'Sponsors' 'Publisher' 'Conference.name' 'Conference.date'
 'Conference.location' 'Conference.code' 'ISSN' 'ISBN' 'CODEN' 'PubMed.ID'
 'Language.of.Original.Document' 'Abbreviated.Source.Title' 'Document.Type'
 'Source' 'EID']



In [8]:



In [9]:

    
t0









    Out[9]:





'Some complex physical systems, such as cellular regulation, ecosystems, and societies, can be represented by local interactions between agents. Then, complex behaviors may emerge. A cellular automaton is a discrete dynamic system with these features. Among the several complex systems, epidemic diseases are given special attention by researchers with respect to their dynamics. Understanding the behavior of an epidemic may well benefit a society. For instance, different proliferation scenarios may be produced and a prevention policy set. This paper presents a new simulation method of the time-spatial spread of the Dengue mosquito with a cellular automaton. Thus, it will be possible to create different dissemination scenarios and preventive policies for these in several regions. Simulations were performed with different initial conditions and parameters as a result of which the behavior of the proposed method was characterized. © 2016 Elsevier B.V.'



In [10]:

    
tokens0 = nltk.wordpunct_tokenize(t0)



In [11]:

    
tokens0









    Out[11]:





['Some',
 'complex',
 'physical',
 'systems',
 ',',
 'such',
 'as',
 'cellular',
 'regulation',
 ',',
 'ecosystems',
 ',',
 'and',
 'societies',
 ',',
 'can',
 'be',
 'represented',
 'by',
 'local',
 'interactions',
 'between',
 'agents',
 '.',
 'Then',
 ',',
 'complex',
 'behaviors',
 'may',
 'emerge',
 '.',
 'A',
 'cellular',
 'automaton',
 'is',
 'a',
 'discrete',
 'dynamic',
 'system',
 'with',
 'these',
 'features',
 '.',
 'Among',
 'the',
 'several',
 'complex',
 'systems',
 ',',
 'epidemic',
 'diseases',
 'are',
 'given',
 'special',
 'attention',
 'by',
 'researchers',
 'with',
 'respect',
 'to',
 'their',
 'dynamics',
 '.',
 'Understanding',
 'the',
 'behavior',
 'of',
 'an',
 'epidemic',
 'may',
 'well',
 'benefit',
 'a',
 'society',
 '.',
 'For',
 'instance',
 ',',
 'different',
 'proliferation',
 'scenarios',
 'may',
 'be',
 'produced',
 'and',
 'a',
 'prevention',
 'policy',
 'set',
 '.',
 'This',
 'paper',
 'presents',
 'a',
 'new',
 'simulation',
 'method',
 'of',
 'the',
 'time',
 '-',
 'spatial',
 'spread',
 'of',
 'the',
 'Dengue',
 'mosquito',
 'with',
 'a',
 'cellular',
 'automaton',
 '.',
 'Thus',
 ',',
 'it',
 'will',
 'be',
 'possible',
 'to',
 'create',
 'different',
 'dissemination',
 'scenarios',
 'and',
 'preventive',
 'policies',
 'for',
 'these',
 'in',
 'several',
 'regions',
 '.',
 'Simulations',
 'were',
 'performed',
 'with',
 'different',
 'initial',
 'conditions',
 'and',
 'parameters',
 'as',
 'a',
 'result',
 'of',
 'which',
 'the',
 'behavior',
 'of',
 'the',
 'proposed',
 'method',
 'was',
 'characterized',
 '.',
 '©',
 '2016',
 'Elsevier',
 'B',
 '.',
 'V',
 '.']



In [12]:

    
nltk_to = nltk.Text(t0)



In [13]:

    
words = [w.lower() for w in nltk_to]
vocab = sorted(set(words))
type(vocab)









    Out[13]:





list



In [14]:

    
from nltk.corpus import stopwords
filtered_words_T0 = [w for w in tokens0 if not w in stopwords.words('english')]



In [15]:

    
print(filtered_words_T0)









    



['Some', 'complex', 'physical', 'systems', ',', 'cellular', 'regulation', ',', 'ecosystems', ',', 'societies', ',', 'represented', 'local', 'interactions', 'agents', '.', 'Then', ',', 'complex', 'behaviors', 'may', 'emerge', '.', 'A', 'cellular', 'automaton', 'discrete', 'dynamic', 'system', 'features', '.', 'Among', 'several', 'complex', 'systems', ',', 'epidemic', 'diseases', 'given', 'special', 'attention', 'researchers', 'respect', 'dynamics', '.', 'Understanding', 'behavior', 'epidemic', 'may', 'well', 'benefit', 'society', '.', 'For', 'instance', ',', 'different', 'proliferation', 'scenarios', 'may', 'produced', 'prevention', 'policy', 'set', '.', 'This', 'paper', 'presents', 'new', 'simulation', 'method', 'time', '-', 'spatial', 'spread', 'Dengue', 'mosquito', 'cellular', 'automaton', '.', 'Thus', ',', 'possible', 'create', 'different', 'dissemination', 'scenarios', 'preventive', 'policies', 'several', 'regions', '.', 'Simulations', 'performed', 'different', 'initial', 'conditions', 'parameters', 'result', 'behavior', 'proposed', 'method', 'characterized', '.', '©', '2016', 'Elsevier', 'B', '.', 'V', '.']



In [16]:

    
filtered_words_t0 = [w.lower() for w in filtered_words_T0 if w.isalnum()]
print(filtered_words_t0)









    



['some', 'complex', 'physical', 'systems', 'cellular', 'regulation', 'ecosystems', 'societies', 'represented', 'local', 'interactions', 'agents', 'then', 'complex', 'behaviors', 'may', 'emerge', 'a', 'cellular', 'automaton', 'discrete', 'dynamic', 'system', 'features', 'among', 'several', 'complex', 'systems', 'epidemic', 'diseases', 'given', 'special', 'attention', 'researchers', 'respect', 'dynamics', 'understanding', 'behavior', 'epidemic', 'may', 'well', 'benefit', 'society', 'for', 'instance', 'different', 'proliferation', 'scenarios', 'may', 'produced', 'prevention', 'policy', 'set', 'this', 'paper', 'presents', 'new', 'simulation', 'method', 'time', 'spatial', 'spread', 'dengue', 'mosquito', 'cellular', 'automaton', 'thus', 'possible', 'create', 'different', 'dissemination', 'scenarios', 'preventive', 'policies', 'several', 'regions', 'simulations', 'performed', 'different', 'initial', 'conditions', 'parameters', 'result', 'behavior', 'proposed', 'method', 'characterized', '2016', 'elsevier', 'b', 'v']



In [19]:

    
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(filtered_words_t0)
X_train_counts









    Out[19]:





<91x73 sparse matrix of type '<class 'numpy.int64'>'
	with 88 stored elements in Compressed Sparse Row format>



In [21]:

    
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape









    Out[21]:





(91, 73)



In [22]:

    
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape









    Out[22]:





(91, 73)



In [30]:

    
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

train_set = df['Abstract'].tolist()

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)  #finds the tfidf score with normalization
print ("cosine scores ==> ",cosine_similarity(tfidf_matrix_train, tfidf_matrix_train))









    



cosine scores ==>  [[ 1.          0.12911647  0.09299782 ...,  0.10117668  0.16888597
   0.04722414]
 [ 0.12911647  1.          0.22658251 ...,  0.13396575  0.17346689
   0.05257748]
 [ 0.09299782  0.22658251  1.         ...,  0.08378385  0.15058275
   0.05015608]]



In [31]:

    
rtn = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)



In [37]:

    
print(rtn[0])









    



[ 1.          0.12911647  0.09299782 ...,  0.10117668  0.16888597
  0.04722414]



In [40]:

    
np.savetxt('docsim10.csv', rtn[0:10])



In [ ]:

	Unnamed: 0	Authors	Title	Year	Source.title	Volume	Issue	Art..No.	Page.start	Page.end	...	Conference.code	ISSN	ISBN	CODEN	PubMed.ID	Language.of.Original.Document	Abbreviated.Source.Title	Document.Type	Source	EID
0	1	Gouvêa M.M.; Jr.	Time-spatial model on the dynamics of the prol...	2017.0	Communications in Nonlinear Science and Numeri...	44	NaN	NaN	130	143	...	NaN	10075704	NaN	NaN	NaN	English	Comm. Nonlinear Sci. Numer. Simul.	Article	Scopus	2-s2.0-84981507175
1	3	Wang X.; Tang S.; Cheke R.A.	A stage structured mosquito model incorporatin...	2016.0	Journal of Theoretical Biology	411	NaN	NaN	27	36	...	NaN	225193	NaN	JTBIA	NaN	English	J. Theor. Biol.	Article	Scopus	2-s2.0-84991688005
2	4	Tang B.; Xiao Y.; Tang S.; Wu J.	Modelling weekly vector control against Dengue...	2016.0	Journal of Theoretical Biology	410	NaN	NaN	65	76	...	NaN	225193	NaN	JTBIA	NaN	English	J. Theor. Biol.	Article	Scopus	2-s2.0-84988473946
3	5	Delmelle E.; Hagenlocher M.; Kienberger S.; Ca...	A spatial model of socioeconomic and environme...	2016.0	Acta Tropica	164	NaN	NaN	169	176	...	NaN	0001706X	NaN	ACTRA	NaN	English	Acta Trop.	Article	Scopus	2-s2.0-84988037437
4	6	Rodrigues H.S.; Monteiro M.T.T.; Torres D.F.M.	Seasonality effects on dengue: basic reproduct...	2016.0	Mathematical Methods in the Applied Sciences	39	16	NaN	4671	4679	...	NaN	1704214	NaN	MMSCD	NaN	English	Math Methods Appl Sci	Conference Paper	Scopus	2-s2.0-84920399968